When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
slides > slide:not(.nobackground):before { font-size: 12pt; content: ""; position: absolute; bottom: 20px; left: 60px; background: url(cu100.png) no-repeat 0 50%; -webkit-background-size: 30px 30px; -moz-background-size: 30px 30px; -o-background-size: 30px 30px; background-size: 30px 30px; padding-left: 40px; height: 30px; line-height: 1.9; }
4/6/2017
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
# Prepare iris: reproducible 70/30 train/test split
set.seed(567)
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
# Rows flagged 1 go to training, rows flagged 2 to testing
traindata <- iris[ind == 1, ]
testdata <- iris[ind == 2, ]
# Class balance of the training set
table(traindata$Species)
## ## setosa versicolor virginica ## 35 37 35
library(party)
# Conditional inference tree: predict species from the four measurements
myFormula <- Species ~ Sepal.Length + Sepal.Width +
  Petal.Length + Petal.Width
iris_ctree <- ctree(myFormula, data = traindata)
library(caret)
# Evaluate the tree on its own training data.
# FIX: removed a leftover git merge-conflict marker (">>>>>>> 0dc063d...")
# that made this line unparseable, and replaced `=` with `<-`.
trainPred <- predict(iris_ctree, traindata)
confusionMatrix(traindata$Species, trainPred)
## Confusion Matrix and Statistics ## ## Reference ## Prediction setosa versicolor virginica ## setosa 35 0 0 ## versicolor 0 36 1 ## virginica 0 3 32 ## ## Overall Statistics ## ## Accuracy : 0.9626 ## 95% CI : (0.907, 0.9897) ## No Information Rate : 0.3645 ## P-Value [Acc > NIR] : < 2.2e-16 ## ## Kappa : 0.9439 ## Mcnemar's Test P-Value : NA ## ## Statistics by Class: ## ## Class: setosa Class: versicolor Class: virginica ## Sensitivity 1.0000 0.9231 0.9697 ## Specificity 1.0000 0.9853 0.9595 ## Pos Pred Value 1.0000 0.9730 0.9143 ## Neg Pred Value 1.0000 0.9571 0.9861 ## Prevalence 0.3271 0.3645 0.3084 ## Detection Rate 0.3271 0.3364 0.2991 ## Detection Prevalence 0.3271 0.3458 0.3271 ## Balanced Accuracy 1.0000 0.9542 0.9646
library(stats)
# k-means with 3 clusters on the four numeric columns
set.seed(101)
km <- kmeans(iris[, 1:4], 3)
# Plot sepal length vs sepal width, coloured by assigned cluster
plot(iris[, 1], iris[, 2], col = km$cluster)
# FIX: there are 3 centers but the original used col = 1:2, recycling only
# two colours; use 1:3 as the sibling chunk (seed 900) already does.
points(km$centers[, c(1, 2)], col = 1:3, pch = 19, cex = 2)
# Cross-tabulate k-means cluster assignments against the true species labels
table(km$cluster, iris$Species)
## ## setosa versicolor virginica ## 1 0 48 14 ## 2 50 0 0 ## 3 0 2 36
# Re-run with a different seed: k-means is sensitive to its random start
set.seed(900)
km <- kmeans(iris[, 1:4], 3)
# Same scatter plot, coloured by the new cluster assignment
plot(iris[, 1], iris[, 2], col = km$cluster)
points(km$centers[, c(1, 2)], col = 1:3, pch = 19, cex = 2)
## ## setosa versicolor virginica ## 1 0 46 50 ## 2 17 4 0 ## 3 33 0 0
# Hierarchical clustering on a random sample of 40 observations.
# Each observation has 4 numeric variables, i.e. a point in 4-D space.
set.seed(101)
# Generalized from the hard-coded 1:150 (seq_len(nrow(iris)) is the
# identical integer vector, so the RNG draw is unchanged).
sampleiris <- iris[sample(seq_len(nrow(iris)), 40), ]
distance <- dist(sampleiris[, -5], method = "euclidean")
cluster <- hclust(distance, method = "average")
# Dendrogram labelled with the true species
plot(cluster, hang = -1, label = sampleiris$Species)
# Prune the tree to 3 clusters and compare with the true labels
group.3 <- cutree(cluster, k = 3)
table(group.3, sampleiris$Species)
## ## group.3 setosa versicolor virginica ## 1 0 15 9 ## 2 13 0 0 ## 3 0 0 3
# Side-by-side comparison: clusters found by hclust vs the true species
par(mfrow = c(1, 2))
plot(sampleiris[, c(1, 2)], col = group.3, pch = 19,
     cex = 1, main = "3 clusters")
plot(sampleiris[, c(1, 2)], col = sampleiris$Species, pch = 19,
     cex = 1, main = "real clusters")
Support: The rule X => Y holds with support sup in T (the transaction data set) if sup% of transactions contain X ∪ Y.
Confidence: The rule holds in T with confidence conf if conf% of transactions that contain X also contain Y.
Lift: The lift of the rule X => Y is the confidence of the rule divided by the expected confidence, assuming that the item sets are independent.
# Load the libraries
library(registry)
library(Matrix)
library(arules)
library(arulesViz)
library(datasets)
# Load the data set
data(Groceries)
# Create an item frequency plot for the top 20 items
itemFrequencyPlot(Groceries, topN = 20, type = "absolute")
# Mine association rules: minimum support 0.001, minimum confidence 0.8
rules <- apriori(Groceries,
                 parameter = list(supp = 0.001, conf = 0.8))
## Apriori ## ## Parameter specification: ## confidence minval smax arem aval originalSupport support minlen maxlen ## 0.8 0.1 1 none FALSE TRUE 0.001 1 10 ## target ext ## rules FALSE ## ## Algorithmic control: ## filter tree heap memopt load sort verbose ## 0.1 TRUE TRUE FALSE TRUE 2 TRUE ## ## Absolute minimum support count: 9 ## ## set item appearances ...[0 item(s)] done [0.00s]. ## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s]. ## sorting and recoding items ... [157 item(s)] done [0.00s]. ## creating transaction tree ... done [0.00s]. ## checking subsets of size 1 2 3 4 5 6 done [0.01s]. ## writing ... [410 rule(s)] done [0.00s]. ## creating S4 object ... done [0.01s].
# Show the top 5 rules, printing numbers with only 2 significant digits
options(digits = 2)
inspect(rules[1:5])
## lhs rhs support confidence lift
## 1 {liquor,red/blush wine} => {bottled beer} 0.0019 0.90 11.2
## 2 {curd,cereals} => {whole milk} 0.0010 0.91 3.6
## 3 {yogurt,cereals} => {whole milk} 0.0017 0.81 3.2
## 4 {butter,jam} => {whole milk} 0.0010 0.83 3.3
## 5 {soups,bottled beer} => {whole milk} 0.0011 0.92 3.6
# Scatter plot of support vs confidence for all mined rules
plot(rules)
# Sort rules by confidence, strongest first, and show the top 5
rules <- sort(rules, by = "confidence", decreasing = TRUE)
inspect(rules[1:5])
## lhs rhs support confidence lift
## 1 {rice,
## sugar} => {whole milk} 0.0012 1 3.9
## 2 {canned fish,
## hygiene articles} => {whole milk} 0.0011 1 3.9
## 3 {root vegetables,
## butter,
## rice} => {whole milk} 0.0010 1 3.9
## 4 {root vegetables,
## whipped/sour cream,
## flour} => {whole milk} 0.0017 1 3.9
## 5 {butter,
## soft cheese,
## domestic eggs} => {whole milk} 0.0010 1 3.9
# Grouped-matrix visualisation of the rules
plot(rules, method = "grouped")
# Re-mine, this time capping rule length at 3 items to reduce redundancy
rules <- apriori(Groceries,
                 parameter = list(supp = 0.001, conf = 0.8, maxlen = 3))
## Apriori ## ## Parameter specification: ## confidence minval smax arem aval originalSupport support minlen maxlen ## 0.8 0.1 1 none FALSE TRUE 0.001 1 3 ## target ext ## rules FALSE ## ## Algorithmic control: ## filter tree heap memopt load sort verbose ## 0.1 TRUE TRUE FALSE TRUE 2 TRUE ## ## Absolute minimum support count: 9 ## ## set item appearances ...[0 item(s)] done [0.00s]. ## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s]. ## sorting and recoding items ... [157 item(s)] done [0.00s]. ## creating transaction tree ... done [0.00s]. ## checking subsets of size 1 2 3 done [0.00s]. ## writing ... [29 rule(s)] done [0.00s]. ## creating S4 object ... done [0.00s].
# Top 5 of the length-capped rule set
inspect(rules[1:5])
## lhs rhs support confidence lift
## 1 {liquor,red/blush wine} => {bottled beer} 0.0019 0.90 11.2
## 2 {curd,cereals} => {whole milk} 0.0010 0.91 3.6
## 3 {yogurt,cereals} => {whole milk} 0.0017 0.81 3.2
## 4 {butter,jam} => {whole milk} 0.0010 0.83 3.3
## 5 {soups,bottled beer} => {whole milk} 0.0011 0.92 3.6
# Graph visualisation of the rules
plot(rules, method = "graph")
# Prune redundant rules: a rule is redundant if it is a subset of another
# rule in the set. FIX: T/F replaced with TRUE/FALSE (T and F are ordinary
# variables that can be reassigned; the full names cannot).
subset.matrix <- is.subset(rules, rules)
subset.matrix[lower.tri(subset.matrix, diag = TRUE)] <- NA
redundant <- colSums(subset.matrix, na.rm = TRUE) >= 1
rules.pruned <- rules[!redundant]
rules <- rules.pruned
# Summary of the pruned rule set: length distribution and quality measures
summary(rules)
## set of 29 rules ## ## rule length distribution (lhs + rhs):sizes ## 3 ## 29 ## ## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 3 3 3 3 3 3 ## ## summary of quality measures: ## support confidence lift ## Min. :0.00102 Min. :0.80 Min. : 3.1 ## 1st Qu.:0.00112 1st Qu.:0.81 1st Qu.: 3.3 ## Median :0.00122 Median :0.85 Median : 3.6 ## Mean :0.00147 Mean :0.86 Mean : 4.0 ## 3rd Qu.:0.00173 3rd Qu.:0.91 3rd Qu.: 4.2 ## Max. :0.00254 Max. :1.00 Max. :11.2 ## ## mining info: ## data ntransactions support confidence ## Groceries 9835 0.001 0.8
# What do customers buy BEFORE whole milk? Fix the rhs to "whole milk".
# FIX: verbose = F replaced with FALSE (F is reassignable; spell it out).
rules <- apriori(data = Groceries, parameter = list(supp = 0.001,
    conf = 0.08), appearance = list(default = "lhs", rhs = "whole milk"),
    control = list(verbose = FALSE))
rules <- sort(rules, decreasing = TRUE, by = "confidence")
inspect(rules[1:5])
## lhs rhs support confidence lift
## 1 {rice,
## sugar} => {whole milk} 0.0012 1 3.9
## 2 {canned fish,
## hygiene articles} => {whole milk} 0.0011 1 3.9
## 3 {root vegetables,
## butter,
## rice} => {whole milk} 0.0010 1 3.9
## 4 {root vegetables,
## whipped/sour cream,
## flour} => {whole milk} 0.0017 1 3.9
## 5 {butter,
## soft cheese,
## domestic eggs} => {whole milk} 0.0010 1 3.9
# What do customers buy AFTER whole milk? Fix the lhs to "whole milk".
# FIX: verbose = F replaced with FALSE (F is reassignable; spell it out).
rules <- apriori(data = Groceries, parameter = list(supp = 0.001,
    conf = 0.15, minlen = 2), appearance = list(default = "rhs",
    lhs = "whole milk"), control = list(verbose = FALSE))
rules <- sort(rules, decreasing = TRUE, by = "confidence")
inspect(rules[1:5])
## lhs rhs support confidence lift
## 6 {whole milk} => {other vegetables} 0.075 0.29 1.5
## 5 {whole milk} => {rolls/buns} 0.057 0.22 1.2
## 4 {whole milk} => {yogurt} 0.056 0.22 1.6
## 2 {whole milk} => {root vegetables} 0.049 0.19 1.8
## 1 {whole milk} => {tropical fruit} 0.042 0.17 1.6
Bagging and Boosting using R
library(ggplot2)
library(randomForest)
# Bagging: train a forest of 500 trees with randomly selected attributes.
# FIX: the randomForest argument is `ntree`, not `nTree` — the misspelled
# name was silently swallowed by `...` and only the default (also 500)
# made the call appear to work.
model <- randomForest(Species ~ ., data = traindata, ntree = 500)
prediction <- predict(model, newdata = testdata, type = "class")
# Confusion matrix on the held-out test set
table(prediction, testdata$Species)
## ## prediction setosa versicolor virginica ## setosa 15 0 0 ## versicolor 0 13 2 ## virginica 0 0 13
library(adabag)
# AdaBoost.M1 with 5 boosting iterations.
# FIX: the adabag::boosting() argument is `boos`, not `boost` — the
# misspelled name was forwarded to rpart.control() via `...`. `boos`
# defaults to TRUE, so the fitted model is unchanged by this fix.
iris.adaboost <- boosting(Species ~ ., data = traindata, boos = TRUE,
    mfinal = 5)
iris.adaboost
## $formula ## Species ~ . ## ## $trees ## $trees[[1]] ## n= 107 ## ## node), split, n, loss, yval, (yprob) ## * denotes terminal node ## ## 1) root 107 65 versicolor (0.262 0.393 0.346) ## 2) Petal.Width< 1.6 69 29 versicolor (0.406 0.580 0.014) ## 4) Petal.Length< 2.6 28 0 setosa (1.000 0.000 0.000) * ## 5) Petal.Length>=2.6 41 1 versicolor (0.000 0.976 0.024) * ## 3) Petal.Width>=1.6 38 2 virginica (0.000 0.053 0.947) * ## ## $trees[[2]] ## n= 107 ## ## node), split, n, loss, yval, (yprob) ## * denotes terminal node ## ## 1) root 107 64 virginica (0.252 0.346 0.402) ## 2) Petal.Length< 5 67 30 versicolor (0.403 0.552 0.045) ## 4) Petal.Length< 2.5 27 0 setosa (1.000 0.000 0.000) * ## 5) Petal.Length>=2.5 40 3 versicolor (0.000 0.925 0.075) * ## 3) Petal.Length>=5 40 0 virginica (0.000 0.000 1.000) * ## ## $trees[[3]] ## n= 107 ## ## node), split, n, loss, yval, (yprob) ## * denotes terminal node ## ## 1) root 107 64 virginica (0.215 0.383 0.402) ## 2) Petal.Length< 2.4 23 0 setosa (1.000 0.000 0.000) * ## 3) Petal.Length>=2.4 84 41 virginica (0.000 0.488 0.512) ## 6) Petal.Width< 1.6 33 3 versicolor (0.000 0.909 0.091) * ## 7) Petal.Width>=1.6 51 11 virginica (0.000 0.216 0.784) ## 14) Sepal.Width>=3.2 8 2 versicolor (0.000 0.750 0.250) * ## 15) Sepal.Width< 3.2 43 5 virginica (0.000 0.116 0.884) * ## ## $trees[[4]] ## n= 107 ## ## node), split, n, loss, yval, (yprob) ## * denotes terminal node ## ## 1) root 107 60 virginica (0.271 0.290 0.439) ## 2) Petal.Length< 2.5 29 0 setosa (1.000 0.000 0.000) * ## 3) Petal.Length>=2.5 78 31 virginica (0.000 0.397 0.603) ## 6) Petal.Length< 4.8 21 2 versicolor (0.000 0.905 0.095) * ## 7) Petal.Length>=4.8 57 12 virginica (0.000 0.211 0.789) ## 14) Petal.Width< 1.9 32 12 virginica (0.000 0.375 0.625) ## 28) Sepal.Length< 6.1 13 4 versicolor (0.000 0.692 0.308) * ## 29) Sepal.Length>=6.1 19 3 virginica (0.000 0.158 0.842) * ## 15) Petal.Width>=1.9 25 0 virginica (0.000 0.000 1.000) * ## ## $trees[[5]] ## n= 107 ## ## node), 
split, n, loss, yval, (yprob) ## * denotes terminal node ## ## 1) root 107 63 virginica (0.187 0.402 0.411) ## 2) Petal.Length< 2.4 20 0 setosa (1.000 0.000 0.000) * ## 3) Petal.Length>=2.4 87 43 virginica (0.000 0.494 0.506) ## 6) Petal.Width< 1.8 54 13 versicolor (0.000 0.759 0.241) ## 12) Petal.Length< 5 40 5 versicolor (0.000 0.875 0.125) ## 24) Sepal.Length>=5.2 33 0 versicolor (0.000 1.000 0.000) * ## 25) Sepal.Length< 5.2 7 2 virginica (0.000 0.286 0.714) * ## 13) Petal.Length>=5 14 6 virginica (0.000 0.429 0.571) * ## 7) Petal.Width>=1.8 33 2 virginica (0.000 0.061 0.939) * ## ## ## $weights ## [1] 1.62 1.41 0.83 0.95 1.17 ## ## $votes ## [,1] [,2] [,3] ## [1,] 6 0.00 0.00 ## [2,] 6 0.00 0.00 ## [3,] 6 0.00 0.00 ## [4,] 6 0.00 0.00 ## [5,] 6 0.00 0.00 ## [6,] 6 0.00 0.00 ## [7,] 6 0.00 0.00 ## [8,] 6 0.00 0.00 ## [9,] 6 0.00 0.00 ## [10,] 6 0.00 0.00 ## [11,] 6 0.00 0.00 ## [12,] 6 0.00 0.00 ## [13,] 6 0.00 0.00 ## [14,] 6 0.00 0.00 ## [15,] 6 0.00 0.00 ## [16,] 6 0.00 0.00 ## [17,] 6 0.00 0.00 ## [18,] 6 0.00 0.00 ## [19,] 6 0.00 0.00 ## [20,] 6 0.00 0.00 ## [21,] 6 0.00 0.00 ## [22,] 6 0.00 0.00 ## [23,] 6 0.00 0.00 ## [24,] 6 0.00 0.00 ## [25,] 6 0.00 0.00 ## [26,] 6 0.00 0.00 ## [27,] 6 0.00 0.00 ## [28,] 6 0.00 0.00 ## [29,] 6 0.00 0.00 ## [30,] 6 0.00 0.00 ## [31,] 6 0.00 0.00 ## [32,] 6 0.00 0.00 ## [33,] 6 0.00 0.00 ## [34,] 6 0.00 0.00 ## [35,] 6 0.00 0.00 ## [36,] 0 5.98 0.00 ## [37,] 0 5.98 0.00 ## [38,] 0 5.03 0.95 ## [39,] 0 5.98 0.00 ## [40,] 0 4.81 1.17 ## [41,] 0 5.98 0.00 ## [42,] 0 4.81 1.17 ## [43,] 0 4.81 1.17 ## [44,] 0 5.98 0.00 ## [45,] 0 5.98 0.00 ## [46,] 0 5.98 0.00 ## [47,] 0 5.98 0.00 ## [48,] 0 5.98 0.00 ## [49,] 0 5.98 0.00 ## [50,] 0 3.18 2.80 ## [51,] 0 5.98 0.00 ## [52,] 0 5.98 0.00 ## [53,] 0 5.03 0.95 ## [54,] 0 2.58 3.40 ## [55,] 0 5.98 0.00 ## [56,] 0 5.98 0.00 ## [57,] 0 5.98 0.00 ## [58,] 0 2.57 3.41 ## [59,] 0 5.98 0.00 ## [60,] 0 5.98 0.00 ## [61,] 0 5.98 0.00 ## [62,] 0 5.98 0.00 ## [63,] 0 5.98 0.00 ## [64,] 0 5.98 
0.00 ## [65,] 0 5.98 0.00 ## [66,] 0 5.98 0.00 ## [67,] 0 5.98 0.00 ## [68,] 0 5.98 0.00 ## [69,] 0 5.98 0.00 ## [70,] 0 5.98 0.00 ## [71,] 0 4.81 1.17 ## [72,] 0 5.98 0.00 ## [73,] 0 0.00 5.98 ## [74,] 0 0.00 5.98 ## [75,] 0 0.00 5.98 ## [76,] 0 0.00 5.98 ## [77,] 0 2.35 3.63 ## [78,] 0 0.83 5.15 ## [79,] 0 0.83 5.15 ## [80,] 0 0.00 5.98 ## [81,] 0 0.00 5.98 ## [82,] 0 0.00 5.98 ## [83,] 0 0.83 5.15 ## [84,] 0 0.00 5.98 ## [85,] 0 0.83 5.15 ## [86,] 0 0.00 5.98 ## [87,] 0 0.83 5.15 ## [88,] 0 1.41 4.57 ## [89,] 0 1.41 4.57 ## [90,] 0 1.41 4.57 ## [91,] 0 0.00 5.98 ## [92,] 0 1.62 4.36 ## [93,] 0 0.00 5.98 ## [94,] 0 0.83 5.15 ## [95,] 0 2.45 3.53 ## [96,] 0 0.00 5.98 ## [97,] 0 0.83 5.15 ## [98,] 0 0.00 5.98 ## [99,] 0 2.35 3.63 ## [100,] 0 0.00 5.98 ## [101,] 0 0.00 5.98 ## [102,] 0 0.00 5.98 ## [103,] 0 0.83 5.15 ## [104,] 0 0.00 5.98 ## [105,] 0 1.41 4.57 ## [106,] 0 0.83 5.15 ## [107,] 0 0.95 5.03 ## ## $prob ## [,1] [,2] [,3] ## [1,] 1 0.00 0.00 ## [2,] 1 0.00 0.00 ## [3,] 1 0.00 0.00 ## [4,] 1 0.00 0.00 ## [5,] 1 0.00 0.00 ## [6,] 1 0.00 0.00 ## [7,] 1 0.00 0.00 ## [8,] 1 0.00 0.00 ## [9,] 1 0.00 0.00 ## [10,] 1 0.00 0.00 ## [11,] 1 0.00 0.00 ## [12,] 1 0.00 0.00 ## [13,] 1 0.00 0.00 ## [14,] 1 0.00 0.00 ## [15,] 1 0.00 0.00 ## [16,] 1 0.00 0.00 ## [17,] 1 0.00 0.00 ## [18,] 1 0.00 0.00 ## [19,] 1 0.00 0.00 ## [20,] 1 0.00 0.00 ## [21,] 1 0.00 0.00 ## [22,] 1 0.00 0.00 ## [23,] 1 0.00 0.00 ## [24,] 1 0.00 0.00 ## [25,] 1 0.00 0.00 ## [26,] 1 0.00 0.00 ## [27,] 1 0.00 0.00 ## [28,] 1 0.00 0.00 ## [29,] 1 0.00 0.00 ## [30,] 1 0.00 0.00 ## [31,] 1 0.00 0.00 ## [32,] 1 0.00 0.00 ## [33,] 1 0.00 0.00 ## [34,] 1 0.00 0.00 ## [35,] 1 0.00 0.00 ## [36,] 0 1.00 0.00 ## [37,] 0 1.00 0.00 ## [38,] 0 0.84 0.16 ## [39,] 0 1.00 0.00 ## [40,] 0 0.80 0.20 ## [41,] 0 1.00 0.00 ## [42,] 0 0.80 0.20 ## [43,] 0 0.80 0.20 ## [44,] 0 1.00 0.00 ## [45,] 0 1.00 0.00 ## [46,] 0 1.00 0.00 ## [47,] 0 1.00 0.00 ## [48,] 0 1.00 0.00 ## [49,] 0 1.00 0.00 ## [50,] 0 0.53 0.47 ## [51,] 0 
1.00 0.00 ## [52,] 0 1.00 0.00 ## [53,] 0 0.84 0.16 ## [54,] 0 0.43 0.57 ## [55,] 0 1.00 0.00 ## [56,] 0 1.00 0.00 ## [57,] 0 1.00 0.00 ## [58,] 0 0.43 0.57 ## [59,] 0 1.00 0.00 ## [60,] 0 1.00 0.00 ## [61,] 0 1.00 0.00 ## [62,] 0 1.00 0.00 ## [63,] 0 1.00 0.00 ## [64,] 0 1.00 0.00 ## [65,] 0 1.00 0.00 ## [66,] 0 1.00 0.00 ## [67,] 0 1.00 0.00 ## [68,] 0 1.00 0.00 ## [69,] 0 1.00 0.00 ## [70,] 0 1.00 0.00 ## [71,] 0 0.80 0.20 ## [72,] 0 1.00 0.00 ## [73,] 0 0.00 1.00 ## [74,] 0 0.00 1.00 ## [75,] 0 0.00 1.00 ## [76,] 0 0.00 1.00 ## [77,] 0 0.39 0.61 ## [78,] 0 0.14 0.86 ## [79,] 0 0.14 0.86 ## [80,] 0 0.00 1.00 ## [81,] 0 0.00 1.00 ## [82,] 0 0.00 1.00 ## [83,] 0 0.14 0.86 ## [84,] 0 0.00 1.00 ## [85,] 0 0.14 0.86 ## [86,] 0 0.00 1.00 ## [87,] 0 0.14 0.86 ## [88,] 0 0.23 0.77 ## [89,] 0 0.23 0.77 ## [90,] 0 0.23 0.77 ## [91,] 0 0.00 1.00 ## [92,] 0 0.27 0.73 ## [93,] 0 0.00 1.00 ## [94,] 0 0.14 0.86 ## [95,] 0 0.41 0.59 ## [96,] 0 0.00 1.00 ## [97,] 0 0.14 0.86 ## [98,] 0 0.00 1.00 ## [99,] 0 0.39 0.61 ## [100,] 0 0.00 1.00 ## [101,] 0 0.00 1.00 ## [102,] 0 0.00 1.00 ## [103,] 0 0.14 0.86 ## [104,] 0 0.00 1.00 ## [105,] 0 0.23 0.77 ## [106,] 0 0.14 0.86 ## [107,] 0 0.16 0.84 ## ## $class ## [1] "setosa" "setosa" "setosa" "setosa" "setosa" ## [6] "setosa" "setosa" "setosa" "setosa" "setosa" ## [11] "setosa" "setosa" "setosa" "setosa" "setosa" ## [16] "setosa" "setosa" "setosa" "setosa" "setosa" ## [21] "setosa" "setosa" "setosa" "setosa" "setosa" ## [26] "setosa" "setosa" "setosa" "setosa" "setosa" ## [31] "setosa" "setosa" "setosa" "setosa" "setosa" ## [36] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor" ## [41] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor" ## [46] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor" ## [51] "versicolor" "versicolor" "versicolor" "virginica" "versicolor" ## [56] "versicolor" "versicolor" "virginica" "versicolor" "versicolor" ## [61] "versicolor" "versicolor" "versicolor" "versicolor" 
"versicolor" ## [66] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor" ## [71] "versicolor" "versicolor" "virginica" "virginica" "virginica" ## [76] "virginica" "virginica" "virginica" "virginica" "virginica" ## [81] "virginica" "virginica" "virginica" "virginica" "virginica" ## [86] "virginica" "virginica" "virginica" "virginica" "virginica" ## [91] "virginica" "virginica" "virginica" "virginica" "virginica" ## [96] "virginica" "virginica" "virginica" "virginica" "virginica" ## [101] "virginica" "virginica" "virginica" "virginica" "virginica" ## [106] "virginica" "virginica" ## ## $importance ## Petal.Length Petal.Width Sepal.Length Sepal.Width ## 68.7 26.9 3.1 1.3 ## ## $terms ## Species ~ pesos + Sepal.Length + Sepal.Width + Petal.Length + ## Petal.Width ## attr(,"variables") ## list(Species, pesos, Sepal.Length, Sepal.Width, Petal.Length, ## Petal.Width) ## attr(,"factors") ## pesos Sepal.Length Sepal.Width Petal.Length Petal.Width ## Species 0 0 0 0 0 ## pesos 1 0 0 0 0 ## Sepal.Length 0 1 0 0 0 ## Sepal.Width 0 0 1 0 0 ## Petal.Length 0 0 0 1 0 ## Petal.Width 0 0 0 0 1 ## attr(,"term.labels") ## [1] "pesos" "Sepal.Length" "Sepal.Width" "Petal.Length" ## [5] "Petal.Width" ## attr(,"order") ## [1] 1 1 1 1 1 ## attr(,"intercept") ## [1] 1 ## attr(,"response") ## [1] 1 ## attr(,".Environment") ## <environment: R_GlobalEnv> ## attr(,"predvars") ## list(Species, pesos, Sepal.Length, Sepal.Width, Petal.Length, ## Petal.Width) ## attr(,"dataClasses") ## Species pesos Sepal.Length Sepal.Width Petal.Length ## "factor" "numeric" "numeric" "numeric" "numeric" ## Petal.Width ## "numeric" ## ## $call ## boosting(formula = Species ~ ., data = traindata, mfinal = 5, ## boost = TRUE) ## ## attr(,"vardep.summary") ## setosa versicolor virginica ## 35 37 35 ## attr(,"class") ## [1] "boosting"
# Relative variable importance, largest first
imp <- iris.adaboost$imp
barplot(imp[order(imp, decreasing = TRUE)],
        ylim = c(0, 100), main = "Variables Relative Importance",
        col = "lightblue")
# Confusion matrix of the boosted model on the training data
table(iris.adaboost$class, traindata$Species,
      dnn = c("Predicted Class", "Observed Class"))
## Observed Class ## Predicted Class setosa versicolor virginica ## setosa 35 0 0 ## versicolor 0 35 0 ## virginica 0 2 35
# Needed <- c('tm', 'SnowballCC', 'RColorBrewer',
# 'wordcloud', 'biclust', 'igraph', 'fpc')
# install.packages(Needed, dependencies = TRUE)

# Download the complete works of Shakespeare once and cache locally.
# FIX: `=` replaced with `<-` for assignment, per R convention.
TEXTFILE <- "t8.shakespeare.txt"
if (!file.exists(TEXTFILE)) {
  download.file("https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt",
                destfile = TEXTFILE)
}
shakespeare <- readLines(TEXTFILE)
length(shakespeare)
## [1] 124456
# Strip the file header (first 173 lines) and the trailing license block,
# collapse to one string, then split into one document per work using the
# <<...>> separators embedded in the text.
# FIX: `=` replaced with `<-` for assignment.
# NOTE(review): 124195 is a hard-coded line index into this exact file
# revision — confirm it still marks the start of the license text.
shakespeare <- shakespeare[-(1:173)]
shakespeare <- shakespeare[-(124195:length(shakespeare))]
shakespeare <- paste(shakespeare, collapse = " ")
shakespeare <- strsplit(shakespeare, "<<[^>]*>>")[[1]]

# Build a tm corpus with one document per element
library(tm)
docs.vec <- VectorSource(shakespeare)
docs.corpus <- Corpus(docs.vec)
summary(docs.corpus)
## Length Class Mode ## 1 2 PlainTextDocument list ## 2 2 PlainTextDocument list ## 3 2 PlainTextDocument list ## 4 2 PlainTextDocument list ## 5 2 PlainTextDocument list ## 6 2 PlainTextDocument list ## 7 2 PlainTextDocument list ## 8 2 PlainTextDocument list ## 9 2 PlainTextDocument list ## 10 2 PlainTextDocument list ## 11 2 PlainTextDocument list ## 12 2 PlainTextDocument list ## 13 2 PlainTextDocument list ## 14 2 PlainTextDocument list ## 15 2 PlainTextDocument list ## 16 2 PlainTextDocument list ## 17 2 PlainTextDocument list ## 18 2 PlainTextDocument list ## 19 2 PlainTextDocument list ## 20 2 PlainTextDocument list ## 21 2 PlainTextDocument list ## 22 2 PlainTextDocument list ## 23 2 PlainTextDocument list ## 24 2 PlainTextDocument list ## 25 2 PlainTextDocument list ## 26 2 PlainTextDocument list ## 27 2 PlainTextDocument list ## 28 2 PlainTextDocument list ## 29 2 PlainTextDocument list ## 30 2 PlainTextDocument list ## 31 2 PlainTextDocument list ## 32 2 PlainTextDocument list ## 33 2 PlainTextDocument list ## 34 2 PlainTextDocument list ## 35 2 PlainTextDocument list ## 36 2 PlainTextDocument list ## 37 2 PlainTextDocument list ## 38 2 PlainTextDocument list ## 39 2 PlainTextDocument list ## 40 2 PlainTextDocument list ## 41 2 PlainTextDocument list ## 42 2 PlainTextDocument list ## 43 2 PlainTextDocument list ## 44 2 PlainTextDocument list ## 45 2 PlainTextDocument list ## 46 2 PlainTextDocument list ## 47 2 PlainTextDocument list ## 48 2 PlainTextDocument list ## 49 2 PlainTextDocument list ## 50 2 PlainTextDocument list ## 51 2 PlainTextDocument list ## 52 2 PlainTextDocument list ## 53 2 PlainTextDocument list ## 54 2 PlainTextDocument list ## 55 2 PlainTextDocument list ## 56 2 PlainTextDocument list ## 57 2 PlainTextDocument list ## 58 2 PlainTextDocument list ## 59 2 PlainTextDocument list ## 60 2 PlainTextDocument list ## 61 2 PlainTextDocument list ## 62 2 PlainTextDocument list ## 63 2 PlainTextDocument list ## 64 2 PlainTextDocument list ## 
65 2 PlainTextDocument list ## 66 2 PlainTextDocument list ## 67 2 PlainTextDocument list ## 68 2 PlainTextDocument list ## 69 2 PlainTextDocument list ## 70 2 PlainTextDocument list ## 71 2 PlainTextDocument list ## 72 2 PlainTextDocument list ## 73 2 PlainTextDocument list ## 74 2 PlainTextDocument list ## 75 2 PlainTextDocument list ## 76 2 PlainTextDocument list ## 77 2 PlainTextDocument list ## 78 2 PlainTextDocument list ## 79 2 PlainTextDocument list ## 80 2 PlainTextDocument list ## 81 2 PlainTextDocument list ## 82 2 PlainTextDocument list ## 83 2 PlainTextDocument list ## 84 2 PlainTextDocument list ## 85 2 PlainTextDocument list ## 86 2 PlainTextDocument list ## 87 2 PlainTextDocument list ## 88 2 PlainTextDocument list ## 89 2 PlainTextDocument list ## 90 2 PlainTextDocument list ## 91 2 PlainTextDocument list ## 92 2 PlainTextDocument list ## 93 2 PlainTextDocument list ## 94 2 PlainTextDocument list ## 95 2 PlainTextDocument list ## 96 2 PlainTextDocument list ## 97 2 PlainTextDocument list ## 98 2 PlainTextDocument list ## 99 2 PlainTextDocument list ## 100 2 PlainTextDocument list ## 101 2 PlainTextDocument list ## 102 2 PlainTextDocument list ## 103 2 PlainTextDocument list ## 104 2 PlainTextDocument list ## 105 2 PlainTextDocument list ## 106 2 PlainTextDocument list ## 107 2 PlainTextDocument list ## 108 2 PlainTextDocument list ## 109 2 PlainTextDocument list ## 110 2 PlainTextDocument list ## 111 2 PlainTextDocument list ## 112 2 PlainTextDocument list ## 113 2 PlainTextDocument list ## 114 2 PlainTextDocument list ## 115 2 PlainTextDocument list ## 116 2 PlainTextDocument list ## 117 2 PlainTextDocument list ## 118 2 PlainTextDocument list ## 119 2 PlainTextDocument list ## 120 2 PlainTextDocument list ## 121 2 PlainTextDocument list ## 122 2 PlainTextDocument list ## 123 2 PlainTextDocument list ## 124 2 PlainTextDocument list ## 125 2 PlainTextDocument list ## 126 2 PlainTextDocument list ## 127 2 PlainTextDocument list ## 128 2 
PlainTextDocument list ## 129 2 PlainTextDocument list ## 130 2 PlainTextDocument list ## 131 2 PlainTextDocument list ## 132 2 PlainTextDocument list ## 133 2 PlainTextDocument list ## 134 2 PlainTextDocument list ## 135 2 PlainTextDocument list ## 136 2 PlainTextDocument list ## 137 2 PlainTextDocument list ## 138 2 PlainTextDocument list ## 139 2 PlainTextDocument list ## 140 2 PlainTextDocument list ## 141 2 PlainTextDocument list ## 142 2 PlainTextDocument list ## 143 2 PlainTextDocument list ## 144 2 PlainTextDocument list ## 145 2 PlainTextDocument list ## 146 2 PlainTextDocument list ## 147 2 PlainTextDocument list ## 148 2 PlainTextDocument list ## 149 2 PlainTextDocument list ## 150 2 PlainTextDocument list ## 151 2 PlainTextDocument list ## 152 2 PlainTextDocument list ## 153 2 PlainTextDocument list ## 154 2 PlainTextDocument list ## 155 2 PlainTextDocument list ## 156 2 PlainTextDocument list ## 157 2 PlainTextDocument list ## 158 2 PlainTextDocument list ## 159 2 PlainTextDocument list ## 160 2 PlainTextDocument list ## 161 2 PlainTextDocument list ## 162 2 PlainTextDocument list ## 163 2 PlainTextDocument list ## 164 2 PlainTextDocument list ## 165 2 PlainTextDocument list ## 166 2 PlainTextDocument list ## 167 2 PlainTextDocument list ## 168 2 PlainTextDocument list ## 169 2 PlainTextDocument list ## 170 2 PlainTextDocument list ## 171 2 PlainTextDocument list ## 172 2 PlainTextDocument list ## 173 2 PlainTextDocument list ## 174 2 PlainTextDocument list ## 175 2 PlainTextDocument list ## 176 2 PlainTextDocument list ## 177 2 PlainTextDocument list ## 178 2 PlainTextDocument list ## 179 2 PlainTextDocument list ## 180 2 PlainTextDocument list ## 181 2 PlainTextDocument list ## 182 2 PlainTextDocument list ## 183 2 PlainTextDocument list ## 184 2 PlainTextDocument list ## 185 2 PlainTextDocument list ## 186 2 PlainTextDocument list ## 187 2 PlainTextDocument list ## 188 2 PlainTextDocument list ## 189 2 PlainTextDocument list ## 190 2 
PlainTextDocument list ## 191 2 PlainTextDocument list ## 192 2 PlainTextDocument list ## 193 2 PlainTextDocument list ## 194 2 PlainTextDocument list ## 195 2 PlainTextDocument list ## 196 2 PlainTextDocument list ## 197 2 PlainTextDocument list ## 198 2 PlainTextDocument list ## 199 2 PlainTextDocument list ## 200 2 PlainTextDocument list ## 201 2 PlainTextDocument list ## 202 2 PlainTextDocument list ## 203 2 PlainTextDocument list ## 204 2 PlainTextDocument list ## 205 2 PlainTextDocument list ## 206 2 PlainTextDocument list ## 207 2 PlainTextDocument list ## 208 2 PlainTextDocument list ## 209 2 PlainTextDocument list ## 210 2 PlainTextDocument list ## 211 2 PlainTextDocument list ## 212 2 PlainTextDocument list ## 213 2 PlainTextDocument list ## 214 2 PlainTextDocument list ## 215 2 PlainTextDocument list ## 216 2 PlainTextDocument list ## 217 2 PlainTextDocument list ## 218 2 PlainTextDocument list ## 219 2 PlainTextDocument list
# Remove punctuation from every document
docs.corpus <- tm_map(docs.corpus, removePunctuation)
head(docs.corpus)
## <<SimpleCorpus>> ## Metadata: corpus specific: 1, document level (indexed): 0 ## Content: documents: 6
# Remove numbers
docs.corpus <- tm_map(docs.corpus, removeNumbers)
# Lower-case all text.
# FIX: base functions like tolower must be wrapped in content_transformer()
# so tm_map() keeps returning PlainTextDocuments instead of bare character
# vectors — a well-known tm (>= 0.6) pitfall that later breaks
# DocumentTermMatrix().
docs.corpus <- tm_map(docs.corpus, content_transformer(tolower))
# Remove English stopwords
docs.corpus <- tm_map(docs.corpus, removeWords, stopwords("english"))
# Stem words (drop "ing", plural "s"/"es", etc.) and squeeze whitespace
library(SnowballC)
docs.corpus <- tm_map(docs.corpus, stemDocument)
docs.corpus <- tm_map(docs.corpus, stripWhitespace)
Create Document Term Matrix
# Create the document-term matrix and inspect a 10x10 corner of it
dtm <- DocumentTermMatrix(docs.corpus)
inspect(dtm[1:10, 1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>> ## Non-/sparse entries: 19/81 ## Sparsity : 81% ## Maximal term length: 6 ## Weighting : term frequency (tf) ## Sample : ## Terms ## Docs accept addit agent allow alter altern appli aris asi associ ## 1 1 1 1 1 1 2 1 1 1 1 ## 10 0 0 0 0 0 0 0 0 0 0 ## 2 2 2 0 2 6 0 1 2 0 0 ## 3 0 0 0 0 0 0 0 0 0 0 ## 4 0 0 0 0 0 0 0 0 0 0 ## 5 0 1 0 0 0 0 0 0 0 0 ## 6 0 0 0 0 0 0 0 0 0 0 ## 7 0 1 0 0 0 0 0 0 0 0 ## 8 0 0 0 1 0 0 0 0 0 0 ## 9 0 0 0 0 0 0 0 0 0 0
# Create the transposed term-document matrix and inspect a 10x10 corner
tdm <- TermDocumentMatrix(docs.corpus)
inspect(tdm[1:10, 1:10])
## <<TermDocumentMatrix (terms: 10, documents: 10)>> ## Non-/sparse entries: 19/81 ## Sparsity : 81% ## Maximal term length: 6 ## Weighting : term frequency (tf) ## Sample : ## Docs ## Terms 1 10 2 3 4 5 6 7 8 9 ## accept 1 0 2 0 0 0 0 0 0 0 ## addit 1 0 2 0 0 1 0 1 0 0 ## agent 1 0 0 0 0 0 0 0 0 0 ## allow 1 0 2 0 0 0 0 0 1 0 ## alter 1 0 6 0 0 0 0 0 0 0 ## altern 2 0 0 0 0 0 0 0 0 0 ## appli 1 0 1 0 0 0 0 0 0 0 ## aris 1 0 2 0 0 0 0 0 0 0 ## asi 1 0 0 0 0 0 0 0 0 0 ## associ 1 0 0 0 0 0 0 0 0 0
# Explore the data: total frequency of each term across all documents
freq <- colSums(as.matrix(dtm))
length(freq)
## [1] 18786
# Term indices ordered from least to most frequent
ord <- order(freq)
head(ord)
## [1] 9 11 13 14 15 20
# Start by removing sparse terms.
# FIX: `=` replaced with `<-` for assignment.
# NOTE(review): sparse = 0.1 keeps only terms present in at least 90% of
# documents — the result below is an empty matrix (0 terms, see dim()
# output). A value near 0.99 is almost certainly what was intended;
# confirm and adjust.
TDM.common <- removeSparseTerms(tdm, 0.1)
dim(tdm)
## [1] 18786 219
# Dimensions after sparse-term removal (0 terms survive the 0.1 threshold)
dim(TDM.common)
## [1] 0 219
# Build a word-frequency table, most frequent terms first
term.matrix <- as.matrix(tdm)
v <- sort(rowSums(term.matrix), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)
## word freq ## thou thou 5485 ## will will 5080 ## thi thi 4032 ## shall shall 3595 ## lord lord 3566 ## come come 3283 ## thee thee 3178 ## king king 3170 ## good good 2966 ## sir sir 2797
library(wordcloud)
# Fixed seed so the cloud layout is reproducible
set.seed(1234)
# Word cloud of the 200 most frequent terms
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
email : veerasak.kr568@cbs.chula.ac.th